Load a couple libraries

library(tidyverse)
library(glue) # similar to paste() but more powerful

Create some fake visit data

Creating fake data is really helpful for creating a reproducible example (aka reprex)

npats <- 100
nvisits <- 10
create_visit_df <- function(visit_number) {
  tibble(
    patid  = 1:npats,
    visitdate = lubridate::as_date(sample(0:1e4, 100)),
    blooddraw = rbinom(npats, 1, prob = .7),
    bloodrawwho_nurse = blooddraw*rbinom(npats, 1, prob = .7),
    bloodrawwho_other = blooddraw*(1-bloodrawwho_nurse)
  ) %>% 
    rename_at(vars(-patid), ~paste0(., "_", visit_number))
}

visit_wide <- map(1:nvisits, create_visit_df) %>% 
  reduce(inner_join, by = "patid")

One row per patient with grouped columns

The goal: One row per visit

## # A tibble: 1,000 x 5
##    patid visitdate  blooddraw bloodrawwho_nurse bloodrawwho_other
##    <int> <date>         <int>             <int>             <dbl>
##  1     1 1980-01-19         0                 0                 0
##  2     1 1987-12-02         0                 0                 0
##  3     1 1992-09-22         0                 0                 0
##  4     1 1993-11-20         1                 1                 0
##  5     1 1985-02-24         1                 1                 0
##  6     1 1981-02-25         0                 0                 0
##  7     1 1989-09-01         0                 0                 0
##  8     1 1996-11-26         1                 1                 0
##  9     1 1983-08-20         1                 1                 0
## 10     1 1983-09-09         0                 0                 0
## # … with 990 more rows

Visually…

Lists

Lists can contain anything

random_stuff <- list(mtcars,
                     qplot(mpg, cyl, data = mtcars), 
                     function(x) x^2, 
                     list("a", "b", "c"))
str(random_stuff, max.level = 1)
## List of 4
##  $ :'data.frame':    32 obs. of  11 variables:
##  $ :List of 9
##   ..- attr(*, "class")= chr [1:2] "gg" "ggplot"
##  $ :function (x)  
##   ..- attr(*, "srcref")= 'srcref' int [1:8] 4 22 4 36 22 36 4 4
##   .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x629eba8> 
##  $ :List of 3

More complex objects in R are often just lists.

Lists that contain objects of one type are especially useful

list(1:3, 4:10, seq(0, 20, by = 2))
## [[1]]
## [1] 1 2 3
## 
## [[2]]
## [1]  4  5  6  7  8  9 10
## 
## [[3]]
##  [1]  0  2  4  6  8 10 12 14 16 18 20

Apply a function to each element of a list

Put the output of the element in a new list

numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2))

map(numeric_list, sum)
## [[1]]
## [1] 6
## 
## [[2]]
## [1] 49
## 
## [[3]]
## [1] 110

Apply a custom function

numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2))

L2norm <- function(x) sqrt(sum(x^2))

map(numeric_list, L2norm)
## [[1]]
## [1] 3.741657
## 
## [[2]]
## [1] 19.26136
## 
## [[3]]
## [1] 39.24283

Create a custom function on the fly

Also called an anonymous function since it has no name

numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2))

map(numeric_list, function(x) sqrt(sum(x^2)))
## [[1]]
## [1] 3.741657
## 
## [[2]]
## [1] 19.26136
## 
## [[3]]
## [1] 39.24283

Use the ~ . shortcut to create a custom function on the fly

This shortcut only works in the tidyverse

numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2))

map(numeric_list, ~sqrt(sum(.^2)))
## [[1]]
## [1] 3.741657
## 
## [[2]]
## [1] 19.26136
## 
## [[3]]
## [1] 39.24283

Reduce a list to a single value

Use a function that takes two arguments and returns one value to reduce a list.

+ is such a function

numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2))

map(numeric_list, ~sqrt(sum(.^2))) %>% 
  reduce(`+`)
## [1] 62.24585

Another example using glue for string concatenation

Anything inside {} is evaluated as code

map(1:3, ~glue("_{.}")) %>% 
  reduce(c)
## [1] "_1" "_2" "_3"

Variants of map will do the reduce step for us

map_chr returns a character vector.

There are many other variants of the map function.

map_chr(1:3, ~glue("_{.}")) 
## [1] "_1" "_2" "_3"

And just for the fun of it…

Ok.. Back to the problem at hand

We can easily select all variables for one visit

select(visit_wide, patid, ends_with("_1"))
## # A tibble: 100 x 5
##    patid visitdate_1 blooddraw_1 bloodrawwho_nurse_1 bloodrawwho_other_1
##    <int> <date>            <int>               <int>               <dbl>
##  1     1 1980-01-19            0                   0                   0
##  2     2 1974-05-20            0                   0                   0
##  3     3 1974-04-05            1                   1                   0
##  4     4 1977-03-26            1                   1                   0
##  5     5 1977-02-19            1                   1                   0
##  6     6 1970-01-21            0                   0                   0
##  7     7 1976-04-11            1                   1                   0
##  8     8 1981-08-23            1                   0                   1
##  9     9 1987-01-31            1                   1                   0
## 10    10 1988-03-30            1                   0                   1
## # … with 90 more rows

Split the dataframe into a list of dataframes

df_list <- map(1:nvisits, ~select(visit_wide, patid, ends_with(glue("_{.}"))))

str(df_list, max.level = 1)
## List of 10
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:
##  $ :Classes 'tbl_df', 'tbl' and 'data.frame':    100 obs. of  5 variables:

The first visit

df_list[[1]]
## # A tibble: 100 x 5
##    patid visitdate_1 blooddraw_1 bloodrawwho_nurse_1 bloodrawwho_other_1
##    <int> <date>            <int>               <int>               <dbl>
##  1     1 1980-01-19            0                   0                   0
##  2     2 1974-05-20            0                   0                   0
##  3     3 1974-04-05            1                   1                   0
##  4     4 1977-03-26            1                   1                   0
##  5     5 1977-02-19            1                   1                   0
##  6     6 1970-01-21            0                   0                   0
##  7     7 1976-04-11            1                   1                   0
##  8     8 1981-08-23            1                   0                   1
##  9     9 1987-01-31            1                   1                   0
## 10    10 1988-03-30            1                   0                   1
## # … with 90 more rows

The second visit

df_list[[2]]
## # A tibble: 100 x 5
##    patid visitdate_2 blooddraw_2 bloodrawwho_nurse_2 bloodrawwho_other_2
##    <int> <date>            <int>               <int>               <dbl>
##  1     1 1987-12-02            0                   0                   0
##  2     2 1996-10-09            1                   1                   0
##  3     3 1994-12-10            0                   0                   0
##  4     4 1977-01-15            0                   0                   0
##  5     5 1986-03-12            1                   1                   0
##  6     6 1979-05-03            1                   1                   0
##  7     7 1973-11-08            0                   0                   0
##  8     8 1976-05-09            1                   1                   0
##  9     9 1974-08-21            1                   1                   0
## 10    10 1989-08-11            1                   1                   0
## # … with 90 more rows

We need to rename the columns

df_list2 <- map(df_list, ~rename_all(., ~str_remove(., "_[:digit:]+$")))

map(df_list2, names)
## [[1]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[2]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[3]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[4]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[5]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[6]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[7]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[8]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[9]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"
## 
## [[10]]
## [1] "patid"             "visitdate"         "blooddraw"        
## [4] "bloodrawwho_nurse" "bloodrawwho_other"

Finally we reduce using bind_rows

bind_rows takes two dataframes and rowbinds them reducing the list with bindrows will iteratively apply bind_rows so we end up with one dataframe

reduce(df_list2, bind_rows)
## # A tibble: 1,000 x 5
##    patid visitdate  blooddraw bloodrawwho_nurse bloodrawwho_other
##    <int> <date>         <int>             <int>             <dbl>
##  1     1 1980-01-19         0                 0                 0
##  2     2 1974-05-20         0                 0                 0
##  3     3 1974-04-05         1                 1                 0
##  4     4 1977-03-26         1                 1                 0
##  5     5 1977-02-19         1                 1                 0
##  6     6 1970-01-21         0                 0                 0
##  7     7 1976-04-11         1                 1                 0
##  8     8 1981-08-23         1                 0                 1
##  9     9 1987-01-31         1                 1                 0
## 10    10 1988-03-30         1                 0                 1
## # … with 990 more rows

Using the pipe %>% we can do all this three lines

map(1:nvisits, ~select(visit_wide, patid, ends_with(glue("_{.}")))) %>% 
  map(~rename_all(., ~str_remove(., "_.$"))) %>% 
  reduce(bind_rows)
## # A tibble: 1,000 x 9
##    patid visitdate  blooddraw bloodrawwho_nur… bloodrawwho_oth…
##    <int> <date>         <int>            <int>            <dbl>
##  1     1 1980-01-19         0                0                0
##  2     2 1974-05-20         0                0                0
##  3     3 1974-04-05         1                1                0
##  4     4 1977-03-26         1                1                0
##  5     5 1977-02-19         1                1                0
##  6     6 1970-01-21         0                0                0
##  7     7 1976-04-11         1                1                0
##  8     8 1981-08-23         1                0                1
##  9     9 1987-01-31         1                1                0
## 10    10 1988-03-30         1                0                1
## # … with 990 more rows, and 4 more variables: visitdate_10 <date>,
## #   blooddraw_10 <int>, bloodrawwho_nurse_10 <int>,
## #   bloodrawwho_other_10 <dbl>

Using one of the map variants we can do it in two lines

map(1:nvisits, ~select(visit_wide, patid, ends_with(glue("_{.}")))) %>% 
  map_dfr(~rename_all(., ~str_remove(., "_[:digit:]+$"))) 
## # A tibble: 1,000 x 5
##    patid visitdate  blooddraw bloodrawwho_nurse bloodrawwho_other
##    <int> <date>         <int>             <int>             <dbl>
##  1     1 1980-01-19         0                 0                 0
##  2     2 1974-05-20         0                 0                 0
##  3     3 1974-04-05         1                 1                 0
##  4     4 1977-03-26         1                 1                 0
##  5     5 1977-02-19         1                 1                 0
##  6     6 1970-01-21         0                 0                 0
##  7     7 1976-04-11         1                 1                 0
##  8     8 1981-08-23         1                 0                 1
##  9     9 1987-01-31         1                 1                 0
## 10    10 1988-03-30         1                 0                 1
## # … with 990 more rows

It even works with a database

Create a SQLite database in memory and load our visit data into it

library(DBI)
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbWriteTable(con, "visit_wide", visit_wide)
visit_db <- tbl(con, "visit_wide")

R writes the SQL for us and sends it to the database

We just change bind_rows to union_all

map(1:nvisits, ~select(visit_db, patid, ends_with(glue("_{.}")))) %>% 
  map(~rename_all(., ~str_remove(., "_[:digit:]+$"))) %>% 
  reduce(union_all)
## # Source:   lazy query [?? x 5]
## # Database: sqlite 3.22.0 [:memory:]
##    patid visitdate blooddraw bloodrawwho_nurse bloodrawwho_other
##    <int>     <dbl>     <int>             <int>             <dbl>
##  1     1      3670         0                 0                 0
##  2     2      1600         0                 0                 0
##  3     3      1555         1                 1                 0
##  4     4      2641         1                 1                 0
##  5     5      2606         1                 1                 0
##  6     6        20         0                 0                 0
##  7     7      2292         1                 1                 0
##  8     8      4252         1                 0                 1
##  9     9      6239         1                 1                 0
## 10    10      6663         1                 0                 1
## # … with more rows

map saved us from having to write a lot of SQL

map(1:nvisits, ~select(visit_db, patid, ends_with(glue("_{.}")))) %>% 
  map(~rename_all(., ~str_remove(., "_[:digit:]+$"))) %>% 
  reduce(union_all) %>% 
  show_query()
## <SQL>
## SELECT `patid`, `visitdate_1` AS `visitdate`, `blooddraw_1` AS `blooddraw`, `bloodrawwho_nurse_1` AS `bloodrawwho_nurse`, `bloodrawwho_other_1` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_2` AS `visitdate`, `blooddraw_2` AS `blooddraw`, `bloodrawwho_nurse_2` AS `bloodrawwho_nurse`, `bloodrawwho_other_2` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_3` AS `visitdate`, `blooddraw_3` AS `blooddraw`, `bloodrawwho_nurse_3` AS `bloodrawwho_nurse`, `bloodrawwho_other_3` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_4` AS `visitdate`, `blooddraw_4` AS `blooddraw`, `bloodrawwho_nurse_4` AS `bloodrawwho_nurse`, `bloodrawwho_other_4` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_5` AS `visitdate`, `blooddraw_5` AS `blooddraw`, `bloodrawwho_nurse_5` AS `bloodrawwho_nurse`, `bloodrawwho_other_5` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_6` AS `visitdate`, `blooddraw_6` AS `blooddraw`, `bloodrawwho_nurse_6` AS `bloodrawwho_nurse`, `bloodrawwho_other_6` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_7` AS `visitdate`, `blooddraw_7` AS `blooddraw`, `bloodrawwho_nurse_7` AS `bloodrawwho_nurse`, `bloodrawwho_other_7` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_8` AS `visitdate`, `blooddraw_8` AS `blooddraw`, `bloodrawwho_nurse_8` AS `bloodrawwho_nurse`, `bloodrawwho_other_8` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_9` AS `visitdate`, `blooddraw_9` AS `blooddraw`, `bloodrawwho_nurse_9` AS `bloodrawwho_nurse`, `bloodrawwho_other_9` AS `bloodrawwho_other`
## FROM `visit_wide`
## UNION ALL
## SELECT `patid`, `visitdate_10` AS `visitdate`, `blooddraw_10` AS `blooddraw`, `bloodrawwho_nurse_10` AS `bloodrawwho_nurse`, `bloodrawwho_other_10` AS `bloodrawwho_other`
## FROM `visit_wide`

The takeaway

The map and reduce functions allow us to program on a higher level of abstraction.

The scoped variants of dplyr work much the same way.

mutate_if(); mutate_at(); mutate_all()
select_if(); select_at(); select_all()
filter_if(); filter_at(); filter_all()
rename_if(); rename_at(); rename_all()
summarise_if(); summarise_at(); summarise_all()

These functions map over a dataframe which is a list of columns.

is.list(mtcars)
## [1] TRUE